#imported a bunch of stuff
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import preprocessing as prep
import os
import statistics
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
# Load the ICU dataset that sits next to the notebook (current working directory).
root_dir = os.getcwd()
# os.path.join builds the path with the platform's separator instead of a
# hard-coded "/" glued on by string concatenation.
df = pd.read_csv(os.path.join(root_dir, "icu_data.csv"))
# Trailing expression: renders the DataFrame preview as the cell output.
df
| group | ID | outcome | age | gendera | BMI | hypertensive | atrialfibrillation | CHD with no MI | diabetes | ... | Blood sodium | Blood calcium | Chloride | Anion gap | Magnesium ion | PH | Bicarbonate | Lactic acid | PCO2 | EF | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 125047 | 0.0 | 72 | 1 | 37.588179 | 0 | 0 | 0 | 1 | ... | 138.750000 | 7.463636 | 109.166667 | 13.166667 | 2.618182 | 7.230 | 21.166667 | 0.5 | 40.0 | 55 |
| 1 | 1 | 139812 | 0.0 | 75 | 2 | NaN | 0 | 0 | 0 | 0 | ... | 138.888889 | 8.162500 | 98.444444 | 11.444444 | 1.887500 | 7.225 | 33.444444 | 0.5 | 78.0 | 55 |
| 2 | 1 | 109787 | 0.0 | 83 | 2 | 26.572634 | 0 | 0 | 0 | 0 | ... | 140.714286 | 8.266667 | 105.857143 | 10.000000 | 2.157143 | 7.268 | 30.571429 | 0.5 | 71.5 | 35 |
| 3 | 1 | 130587 | 0.0 | 43 | 2 | 83.264629 | 0 | 0 | 0 | 0 | ... | 138.500000 | 9.476923 | 92.071429 | 12.357143 | 1.942857 | 7.370 | 38.571429 | 0.6 | 75.0 | 55 |
| 4 | 1 | 138290 | 0.0 | 75 | 2 | 31.824842 | 1 | 0 | 0 | 0 | ... | 136.666667 | 8.733333 | 104.500000 | 15.166667 | 1.650000 | 7.250 | 22.000000 | 0.6 | 50.0 | 55 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1172 | 2 | 171130 | 0.0 | 62 | 1 | 25.516870 | 1 | 1 | 0 | 1 | ... | 136.714286 | 10.271429 | 94.428571 | 20.142857 | 2.714286 | NaN | 27.714286 | NaN | NaN | 40 |
| 1173 | 2 | 101659 | 0.0 | 78 | 1 | 25.822710 | 0 | 1 | 0 | 1 | ... | 135.680000 | 10.523529 | 101.720000 | 18.160000 | 2.012500 | NaN | 20.480000 | NaN | NaN | 30 |
| 1174 | 2 | 162069 | 0.0 | 85 | 2 | 23.891779 | 1 | 1 | 0 | 1 | ... | 136.000000 | 8.466667 | 97.285714 | 14.000000 | 2.028571 | NaN | 28.857143 | NaN | NaN | 55 |
| 1175 | 2 | 120967 | 0.0 | 79 | 2 | 35.288554 | 0 | 0 | 1 | 1 | ... | 140.000000 | 8.183333 | 104.000000 | 15.750000 | 2.090000 | NaN | 24.375000 | NaN | NaN | 25 |
| 1176 | 2 | 107636 | 0.0 | 47 | 1 | 23.121384 | 1 | 0 | 0 | 1 | ... | 135.285714 | 9.085714 | 93.857143 | 20.285714 | 2.566667 | NaN | 26.571429 | NaN | NaN | 55 |
1177 rows × 51 columns
Check which columns contain null values so that the missing data can be dealt with.
# Count the missing (null) entries in every column and print the per-column totals.
missing_per_column = df.isnull().sum()
print(missing_per_column)
group 0 ID 0 outcome 1 age 0 gendera 0 BMI 215 hypertensive 0 atrialfibrillation 0 CHD with no MI 0 diabetes 0 deficiencyanemias 0 depression 0 Hyperlipemia 0 Renal failure 0 COPD 0 heart rate 13 Systolic blood pressure 16 Diastolic blood pressure 16 Respiratory rate 13 temperature 19 SP O2 13 Urine output 36 hematocrit 0 RBC 0 MCH 0 MCHC 0 MCV 0 RDW 0 Leucocyte 0 Platelets 0 Neutrophils 144 Basophils 259 Lymphocyte 145 PT 20 INR 20 NT-proBNP 0 Creatine kinase 165 Creatinine 0 Urea nitrogen 0 glucose 18 Blood potassium 0 Blood sodium 0 Blood calcium 1 Chloride 0 Anion gap 0 Magnesium ion 0 PH 292 Bicarbonate 0 Lactic acid 229 PCO2 294 EF 0 dtype: int64
Function that replaces a NaN with the median of the non-missing values in the given column.
def replace_nan_median(x, col, frame=None):
    """Return ``x`` unchanged, or the median of column ``col`` when ``x`` is missing.

    Args:
        x: A single cell value; treated as missing when ``pd.isna(x)`` is true.
        col: Label of the column whose median is used as the fill value.
        frame: Optional DataFrame to take the median from. When omitted, the
            module-level ``df`` is used, which keeps existing two-argument
            calls working unchanged.

    Returns:
        ``x`` itself when it is not missing, otherwise the column median.
    """
    if not pd.isna(x):
        return x
    source = df if frame is None else frame
    # The median is returned as-is. Truncating it with int() would impute a
    # wrong value for fractional columns (e.g. a median of 2.5 would become 2).
    return source[col].median()
Call the replace-NaN-with-median function for each of the features that has missing data.
# Features whose missing entries are filled in via replace_nan_median.
# Note: 'Blood calcium' is imputed here like the others; its NaN row is not dropped.
columns_to_impute = [
    'BMI',
    'heart rate',
    'Systolic blood pressure',
    'Diastolic blood pressure',
    'Respiratory rate',
    'temperature',
    'SP O2',
    'Urine output',
    'PT',
    'INR',
    'glucose',
    'Blood calcium',
]
for feature in columns_to_impute:
    # Bind the column name as a default argument so each lambda sees its own column.
    df[feature] = df[feature].apply(
        lambda value, feature=feature: replace_nan_median(value, feature)
    )
Dropped all of these columns because they had too much missing data; we would have to replace roughly 1/5 of the values with the median, which would distort the results.
# Columns dropped because too many of their values are missing to impute:
#   Creatine kinase, PH, Lactic acid, PCO2, Neutrophils, Basophils, Lymphocyte
# ID is dropped as well: it only identifies the row, which the index already does.
# (Each label is listed once; the original list repeated 'Neutrophils'.)
df = df.drop(columns=['Creatine kinase', 'PH', 'Lactic acid', 'PCO2', 'Neutrophils', 'Basophils',
                      'Lymphocyte', 'ID'])
# Re-check the per-column null counts after the drop.
print(df.isnull().sum())
group 0 outcome 1 age 0 gendera 0 BMI 0 hypertensive 0 atrialfibrillation 0 CHD with no MI 0 diabetes 0 deficiencyanemias 0 depression 0 Hyperlipemia 0 Renal failure 0 COPD 0 heart rate 0 Systolic blood pressure 0 Diastolic blood pressure 0 Respiratory rate 0 temperature 0 SP O2 0 Urine output 0 hematocrit 0 RBC 0 MCH 0 MCHC 0 MCV 0 RDW 0 Leucocyte 0 Platelets 0 PT 0 INR 0 NT-proBNP 0 Creatinine 0 Urea nitrogen 0 glucose 0 Blood potassium 0 Blood sodium 0 Blood calcium 0 Chloride 0 Anion gap 0 Magnesium ion 0 Bicarbonate 0 EF 0 dtype: int64
# outcome - drop rows where it is NaN: it is the prediction target, so it must not be guessed.
# dropna(subset=...) removes every such row and is a no-op when there are none,
# whereas picking `.tolist()[0]` dropped only the first one and raised IndexError
# when no row was missing.
df = df.dropna(subset=['outcome'])
# Convert outcome from float to int; values stay the same, just without the decimal.
df['outcome'] = df['outcome'].astype(int)
# Categorical / flag columns (group label, target, sex code and comorbidity flags);
# these are not continuous measurements.
group_cols = [
    'group', 'outcome', 'gendera', 'hypertensive', 'atrialfibrillation', 'CHD with no MI',
    'diabetes', 'deficiencyanemias', 'depression', 'Hyperlipemia', 'Renal failure', 'COPD',
]
# Every remaining column of df is treated as numerical, in df's column order.
numerical_cols = [label for label in df.columns if label not in group_cols]
# Show a boxplot for every column where outliers are possible
# (i.e. each column listed in numerical_cols).
# NOTE(review): indentation was lost in this export; presumably both calls below
# belong to the loop body (one figure per column) - confirm against the notebook.
for elem in numerical_cols:
# Draw the boxplot for this single column.
df.boxplot([elem])
# Render the figure.
plt.show()
Chose not to modify any of the outliers, because few of them fall outside the range of possibility for these features. Removing the outliers may affect our results, because we want to see whether some of these factors increase or decrease accuracy. Later on, when selecting features, we can drop some of the features with many outliers and see whether they are worth using.
# Display the list of column names collected in numerical_cols (cell output).
numerical_cols
['age', 'BMI', 'heart rate', 'Systolic blood pressure', 'Diastolic blood pressure', 'Respiratory rate', 'temperature', 'SP O2', 'Urine output', 'hematocrit', 'RBC', 'MCH', 'MCHC', 'MCV', 'RDW', 'Leucocyte', 'Platelets', 'PT', 'INR', 'NT-proBNP', 'Creatinine', 'Urea nitrogen', 'glucose', 'Blood potassium', 'Blood sodium', 'Blood calcium', 'Chloride', 'Anion gap', 'Magnesium ion', 'Bicarbonate', 'EF']
# Labels of the numeric feature columns, in the order they appear in df2.
names = [
    'age', 'BMI', 'heart rate', 'Systolic blood pressure', 'Diastolic blood pressure',
    'Respiratory rate', 'temperature', 'SP O2', 'Urine output', 'hematocrit', 'RBC', 'MCH',
    'MCHC', 'MCV', 'RDW', 'Leucocyte', 'Platelets', 'PT', 'INR', 'NT-proBNP', 'Creatinine',
    'Urea nitrogen', 'glucose', 'Blood potassium', 'Blood sodium', 'Blood calcium', 'Chloride',
    'Anion gap', 'Magnesium ion', 'Bicarbonate', 'EF',
]
# One Series per label, pulled from df in the same order as `names`.
numerical_data = [df[feature_name] for feature_name in names]
# Place the Series side by side as columns, keyed by their labels.
df2 = pd.concat(numerical_data, axis=1, keys=names)
Created correlation plots for all numerical data. Wherever there is collinearity, I wanted to drop that feature so that there is less noise for the ML algorithms.
# Apply seaborn's "whitegrid" theme to the plots that follow.
sns.set_theme(style="whitegrid")
# Pairwise plots of every column in df2 (31 features, so a 31x31 grid of axes).
# Left as a bare expression so the returned PairGrid is the cell's output.
sns.pairplot(df2)
<seaborn.axisgrid.PairGrid at 0x7f970a55f100>